
# Figure 2 #
############

# Comparing scans #

source('D:/Pipeline comparisons/Writing/R Functions/CompScansAllpASTYtoAllpSTY.R')
source('D:/Pipeline comparisons/Writing/R Functions/MQCompScansAllpASTYtoAllpSTY2.R')
source('D:/Pipeline comparisons/Writing/R Functions/PD_CompScansAllpASTYtoAllpSTY.R')

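# Pasted console message from the conflicted package ("[conflicted] Will prefer dplyr::filter over any other package"); no conflict_prefer() call is visible in this script.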
suppressPackageStartupMessages(library("tidyverse"))

library(dplyr)
library(stringr)
library(useful)
library(MASS)
library(reshape2)
library(epiDisplay)

 #########################################                  TPP                     #################################

# TPP exports of the rice datasets. Each accession has a plain PSM/site table
# and an "_A" counterpart (presumably the pSTY and pASTY searches -- confirm
# against the exports). The order of the accessions below is also the row
# order of the stacked TPP_All_data.
tpp_plain_files <- c(
  "PXD000923" = "D:/Pipeline comparisons/Writing/Data/TPP/Rice/TPP_PXD000923_PSMSITE.csv",
  "PXD002222" = "D:/Pipeline comparisons/Writing/Data/TPP/Rice/TPP_PXD002222_PSMSITE.csv",
  "PXD002756" = "D:/Pipeline comparisons/Writing/Data/TPP/Rice/TPP_PXD002756_PSMSITE.csv",
  "PXD004705" = "D:/Pipeline comparisons/Writing/Data/TPP/Rice/TPP_PXD004705_PSMSITE.csv",
  "PXD004939" = "D:/Pipeline comparisons/Writing/Data/TPP/Rice/TPP_PXD004939_PSMSITE.csv",
  "PXD005241" = "D:/Pipeline comparisons/Writing/Data/TPP/Rice/TPP_PXD005241_PSMSITE.csv",
  "PXD012764" = "D:/Pipeline comparisons/Writing/Data/TPP/Rice/TPP_PXD012764_PSMSITE.csv",
  "PXD019291" = "D:/Pipeline comparisons/Writing/Data/TPP/Rice/TPP_PXD019291_PSMSITE.csv"
)
tpp_a_files <- c(
  "PXD000923" = "D:/Pipeline comparisons/Writing/Data/TPP/Rice/TPP_PXD000923_A_PSMSITE.csv",
  "PXD002222" = "D:/Pipeline comparisons/Writing/Data/TPP/Rice/TPP_PXD002222_A_PSMSITE.csv",
  "PXD002756" = "D:/Pipeline comparisons/Writing/Data/TPP/Rice/TPP_PXD002756_A_PSMSITE.csv",
  "PXD004705" = "D:/Pipeline comparisons/Writing/Data/TPP/Rice/TPP_PXD004705_A_PSMSITE.csv",
  "PXD004939" = "D:/Pipeline comparisons/Writing/Data/TPP/Rice/TPP_PXD004939_A_PSMSITE.csv",
  "PXD005241" = "D:/Pipeline comparisons/Writing/Data/TPP/Rice/TPP_PXD005241_A_PSMSITE.csv",
  "PXD012764" = "D:/Pipeline comparisons/Writing/Data/TPP/Rice/TPP_PXD012764_A_PSMSITE.csv",
  "PXD019291" = "D:/Pipeline comparisons/Writing/Data/TPP/Rice/TPP_PXD019291_A_PSMSITE.csv"
)
tpp_ids <- names(tpp_plain_files)

# Read all plain tables first, then all "_A" tables.
tpp_plain <- lapply(tpp_plain_files, read.csv)
tpp_a <- lapply(tpp_a_files, read.csv)

# CompScans_AlltoAll() takes the "_A" table first. The accession is stored in
# a Dataset column so the stacked table can be faceted and grouped by it.
tpp_merged <- Map(
  function(with_a, plain, accession) {
    merged <- CompScans_AlltoAll(with_a, plain)
    merged$Dataset <- accession
    merged
  },
  tpp_a, tpp_plain, tpp_ids
)

# Keep every per-dataset table reachable under its usual name:
# TPP_<accession>, TPP_<accession>A and TPP_Merged_<accession>.
invisible(list2env(
  c(
    setNames(tpp_plain, paste0("TPP_", tpp_ids)),
    setNames(tpp_a, paste0("TPP_", tpp_ids, "A")),
    setNames(tpp_merged, paste0("TPP_Merged_", tpp_ids))
  ),
  envir = environment()
))

TPP_All_data <- dplyr::bind_rows(unname(tpp_merged))

# TPP, all merged rows: pASTY value on x against pSTY value on y, one panel per
# Dataset, for three column pairs (Score, PSMScore, PTMScore). Each plot is
# printed with size-12 axis titles and strip labels and is followed by the
# per-dataset correlation of the same pair (complete observations only).
tpp_point_layer <- geom_point(shape = 5, color = "steelblue", size = 2, alpha = I(0.05))
tpp_text_sizes <- theme(
  axis.title = element_text(size = 12),
  strip.text.x = element_text(size = 12)
)

# Score
Score_plot <- ggplot(TPP_All_data, aes(x = pASTY_Score, y = pSTY_Score)) +
  tpp_point_layer +
  facet_wrap(~ Dataset)
Score_plot + tpp_text_sizes
summarize(
  group_by(TPP_All_data, Dataset),
  cor = cor(pASTY_Score, pSTY_Score, use = "complete.obs")
)

# PSMScore
Score_plot2 <- ggplot(TPP_All_data, aes(x = pASTY_PSMScore, y = pSTY_PSMScore)) +
  tpp_point_layer +
  facet_wrap(~ Dataset)
Score_plot2 + tpp_text_sizes
summarize(
  group_by(TPP_All_data, Dataset),
  cor = cor(pASTY_PSMScore, pSTY_PSMScore, use = "complete.obs")
)

# PTMScore
Score_plot3 <- ggplot(TPP_All_data, aes(x = pASTY_PTMScore, y = pSTY_PTMScore)) +
  tpp_point_layer +
  facet_wrap(~ Dataset)
Score_plot3 + tpp_text_sizes
summarize(
  group_by(TPP_All_data, Dataset),
  cor = cor(pASTY_PTMScore, pSTY_PTMScore, use = "complete.obs")
)


# Removing no choice peptides #
###############################


# Count the S/T/Y residues of every Sequence; nSTY is their total.
tpp_sequences <- TPP_All_data$Sequence
TPP_All_data$num_PTM_Y <- str_count(tpp_sequences, "Y")
TPP_All_data$num_PTM_S <- str_count(tpp_sequences, "S")
TPP_All_data$num_PTM_T <- str_count(tpp_sequences, "T")
TPP_All_data$nSTY <- with(TPP_All_data, num_PTM_Y + num_PTM_S + num_PTM_T)

# S/T/Y residues left over after subtracting nPhos (NA when either is NA).
tpp_spare_sites <- TPP_All_data$nSTY - TPP_All_data$nPhos

# NNC: more S/T/Y residues than nPhos. NC ("no choice"): exactly as many,
# restricted to rows with a Sequence. _check: rows where the difference is NA.
# which() skips NA comparisons, so no all-NA rows are created.
TPP_All_data_NNC <- TPP_All_data[which(tpp_spare_sites > 0), ]
TPP_All_data_NC <- TPP_All_data[which(tpp_spare_sites == 0 & !is.na(TPP_All_data$Sequence)), ]

TPP_All_data_check <- TPP_All_data[which(is.na(tpp_spare_sites)), ]


# TPP, rows with more S/T/Y residues than nPhos (TPP_All_data_NNC): the same
# three pASTY-vs-pSTY scatter plots and per-dataset correlations as for the
# full table.
tpp_nnc_points <- geom_point(shape = 5, color = "steelblue", size = 2, alpha = I(0.05))
tpp_nnc_text <- theme(
  axis.title = element_text(size = 12),
  strip.text.x = element_text(size = 12)
)

# Score
Score_plot4 <- ggplot(data = TPP_All_data_NNC, mapping = aes(x = pASTY_Score, y = pSTY_Score)) +
  tpp_nnc_points +
  facet_wrap(~ Dataset)
Score_plot4 + tpp_nnc_text
summarize(
  group_by(TPP_All_data_NNC, Dataset),
  cor = cor(pASTY_Score, pSTY_Score, use = "complete.obs")
)

# PSMScore
Score_plot5 <- ggplot(data = TPP_All_data_NNC, mapping = aes(x = pASTY_PSMScore, y = pSTY_PSMScore)) +
  tpp_nnc_points +
  facet_wrap(~ Dataset)
Score_plot5 + tpp_nnc_text
summarize(
  group_by(TPP_All_data_NNC, Dataset),
  cor = cor(pASTY_PSMScore, pSTY_PSMScore, use = "complete.obs")
)

# PTMScore
Score_plot6 <- ggplot(data = TPP_All_data_NNC, mapping = aes(x = pASTY_PTMScore, y = pSTY_PTMScore)) +
  tpp_nnc_points +
  facet_wrap(~ Dataset)
Score_plot6 + tpp_nnc_text
summarize(
  group_by(TPP_All_data_NNC, Dataset),
  cor = cor(pASTY_PTMScore, pSTY_PTMScore, use = "complete.obs")
)



# Peptides that don't contain Alanine #
######################################

# Number of alanines per Sequence (NA where Sequence is NA).
TPP_All_data$num_A <- str_count(TPP_All_data$Sequence, "A")

# Keep the rows whose Sequence contains no alanine. A logical index that holds
# NA makes `[` return an all-NA row for each NA (Dataset included), which would
# add an "NA" facet and an NA group with no complete pairs for cor(). which()
# drops those positions, so only real rows are kept.
TPP_All_NoAla <- TPP_All_data[which(!(TPP_All_data$num_A > 0)), ]

# TPP, peptides without alanine (TPP_All_NoAla): pASTY-vs-pSTY scatter plots
# per Dataset for Score, PSMScore and PTMScore, each followed by the
# per-dataset correlation over complete observations.
tpp_noala_points <- geom_point(shape = 5, color = "steelblue", size = 2, alpha = I(0.05))
tpp_noala_text <- theme(
  axis.title = element_text(size = 12),
  strip.text.x = element_text(size = 12)
)

# Score
Score_plot7 <- ggplot(TPP_All_NoAla, aes(x = pASTY_Score, y = pSTY_Score)) +
  tpp_noala_points +
  facet_wrap(~ Dataset)
Score_plot7 + tpp_noala_text
summarize(
  group_by(TPP_All_NoAla, Dataset),
  cor = cor(pASTY_Score, pSTY_Score, use = "complete.obs")
)

# PSMScore
Score_plot8 <- ggplot(TPP_All_NoAla, aes(x = pASTY_PSMScore, y = pSTY_PSMScore)) +
  tpp_noala_points +
  facet_wrap(~ Dataset)
Score_plot8 + tpp_noala_text
summarize(
  group_by(TPP_All_NoAla, Dataset),
  cor = cor(pASTY_PSMScore, pSTY_PSMScore, use = "complete.obs")
)

# PTMScore
Score_plot9 <- ggplot(TPP_All_NoAla, aes(x = pASTY_PTMScore, y = pSTY_PTMScore)) +
  tpp_noala_points +
  facet_wrap(~ Dataset)
Score_plot9 + tpp_noala_text
summarize(
  group_by(TPP_All_NoAla, Dataset),
  cor = cor(pASTY_PTMScore, pSTY_PTMScore, use = "complete.obs")
)




# TPP_All_data_NNC$Diff <- TPP_All_data_NNC$pASTY_PTMScore - TPP_All_data_NNC$pSTY_PTMScore

#########################################                  MQ                     #################################

# MQ exports of the rice datasets: a plain PSM/site table and an "_A"
# counterpart per accession (presumably the pSTY and pASTY searches --
# confirm). The accession order below is the row order of MQ_All_data.
mq_plain_files <- c(
  "PXD000923" = "D:/Pipeline comparisons/Writing/Data/MQ/Rice/MQ_PXD000923_PSMSITE.csv",
  "PXD002222" = "D:/Pipeline comparisons/Writing/Data/MQ/Rice/MQ_PXD002222_PSMSITE.csv",
  "PXD002756" = "D:/Pipeline comparisons/Writing/Data/MQ/Rice/MQ_PXD002756_PSMSITE.csv",
  "PXD004705" = "D:/Pipeline comparisons/Writing/Data/MQ/Rice/MQ_PXD004705_PSMSITE.csv",
  "PXD004939" = "D:/Pipeline comparisons/Writing/Data/MQ/Rice/MQ_PXD004939_PSMSITE.csv",
  "PXD005241" = "D:/Pipeline comparisons/Writing/Data/MQ/Rice/MQ_PXD005241_PSMSITE.csv",
  "PXD012764" = "D:/Pipeline comparisons/Writing/Data/MQ/Rice/MQ_PXD012764_PSMSITE.csv",
  "PXD019291" = "D:/Pipeline comparisons/Writing/Data/MQ/Rice/MQ_PXD019291_PSMSITE.csv"
)
mq_a_files <- c(
  "PXD000923" = "D:/Pipeline comparisons/Writing/Data/MQ/Rice/MQ_PXD000923_A_PSMSITE.csv",
  "PXD002222" = "D:/Pipeline comparisons/Writing/Data/MQ/Rice/MQ_PXD002222_A_PSMSITE.csv",
  "PXD002756" = "D:/Pipeline comparisons/Writing/Data/MQ/Rice/MQ_PXD002756_A_PSMSITE.csv",
  "PXD004705" = "D:/Pipeline comparisons/Writing/Data/MQ/Rice/MQ_PXD004705_A_PSMSITE.csv",
  "PXD004939" = "D:/Pipeline comparisons/Writing/Data/MQ/Rice/MQ_PXD004939_A_PSMSITE.csv",
  "PXD005241" = "D:/Pipeline comparisons/Writing/Data/MQ/Rice/MQ_PXD005241_A_PSMSITE.csv",
  "PXD012764" = "D:/Pipeline comparisons/Writing/Data/MQ/Rice/MQ_PXD012764_A_PSMSITE.csv",
  "PXD019291" = "D:/Pipeline comparisons/Writing/Data/MQ/Rice/MQ_PXD019291_A_PSMSITE.csv"
)
mq_ids <- names(mq_plain_files)

# Read all plain tables first, then all "_A" tables.
mq_plain <- lapply(mq_plain_files, read.csv)
mq_a <- lapply(mq_a_files, read.csv)

# MQ_CompScans_AlltoAll() takes the "_A" table first.
mq_merged <- Map(
  function(with_a, plain) MQ_CompScans_AlltoAll(with_a, plain),
  mq_a, mq_plain
)

# Only PXD005241 gets a score-difference column, added before Dataset; the
# other datasets end up with NA in `diff` once the tables are stacked.
mq_merged[["PXD005241"]]$diff <-
  mq_merged[["PXD005241"]]$pASTY_Score - mq_merged[["PXD005241"]]$pSTY_Score

# Tag every merged table with its accession.
mq_merged <- Map(
  function(merged, accession) {
    merged$Dataset <- accession
    merged
  },
  mq_merged, mq_ids
)

# Keep every per-dataset table reachable under its usual name:
# MQ_<accession>, MQ_<accession>A and MQ_Merged_<accession>.
invisible(list2env(
  c(
    setNames(mq_plain, paste0("MQ_", mq_ids)),
    setNames(mq_a, paste0("MQ_", mq_ids, "A")),
    setNames(mq_merged, paste0("MQ_Merged_", mq_ids))
  ),
  envir = environment()
))

MQ_All_data <- dplyr::bind_rows(unname(mq_merged))


# MQ, all merged rows: pASTY value on x against pSTY value on y, one panel per
# Dataset, for three column pairs (Score, PEP, PTMprob). Each plot is printed
# with size-12 axis titles and strip labels and is followed by the per-dataset
# correlation of the same pair (complete observations only).
mq_point_layer <- geom_point(shape = 5, color = "steelblue", size = 2, alpha = I(0.05))
mq_text_sizes <- theme(
  axis.title = element_text(size = 12),
  strip.text.x = element_text(size = 12)
)

# Score
Score_plot <- ggplot(MQ_All_data, aes(x = pASTY_Score, y = pSTY_Score)) +
  mq_point_layer +
  facet_wrap(~ Dataset)
Score_plot + mq_text_sizes
summarize(
  group_by(MQ_All_data, Dataset),
  cor = cor(pASTY_Score, pSTY_Score, use = "complete.obs")
)

# PEP
Score_plot2 <- ggplot(MQ_All_data, aes(x = pASTY_PEP, y = pSTY_PEP)) +
  mq_point_layer +
  facet_wrap(~ Dataset)
Score_plot2 + mq_text_sizes
summarize(
  group_by(MQ_All_data, Dataset),
  cor = cor(pASTY_PEP, pSTY_PEP, use = "complete.obs")
)

# PTMprob
Score_plot3 <- ggplot(MQ_All_data, aes(x = pASTY_PTMprob, y = pSTY_PTMprob)) +
  mq_point_layer +
  facet_wrap(~ Dataset)
Score_plot3 + mq_text_sizes
summarize(
  group_by(MQ_All_data, Dataset),
  cor = cor(pASTY_PTMprob, pSTY_PTMprob, use = "complete.obs")
)

# Removing no choice peptides #
###############################


# Count the S/T/Y residues of every Sequence; nSTY is their total.
mq_sequences <- MQ_All_data$Sequence
MQ_All_data$num_PTM_Y <- str_count(mq_sequences, "Y")
MQ_All_data$num_PTM_S <- str_count(mq_sequences, "S")
MQ_All_data$num_PTM_T <- str_count(mq_sequences, "T")
MQ_All_data$nSTY <- with(MQ_All_data, num_PTM_Y + num_PTM_S + num_PTM_T)

# S/T/Y residues left over after subtracting nPhos (NA when either is NA).
mq_spare_sites <- MQ_All_data$nSTY - MQ_All_data$nPhos

# NNC: more S/T/Y residues than nPhos. NC ("no choice"): exactly as many,
# restricted to rows with a Sequence. _check: rows where the difference is NA.
# which() skips NA comparisons, so no all-NA rows are created.
MQ_All_data_NNC <- MQ_All_data[which(mq_spare_sites > 0), ]
MQ_All_data_NC <- MQ_All_data[which(mq_spare_sites == 0 & !is.na(MQ_All_data$Sequence)), ]

MQ_All_data_check <- MQ_All_data[which(is.na(mq_spare_sites)), ]

# MQ, rows with more S/T/Y residues than nPhos (MQ_All_data_NNC): the same
# three pASTY-vs-pSTY scatter plots and per-dataset correlations as for the
# full table.
mq_nnc_points <- geom_point(shape = 5, color = "steelblue", size = 2, alpha = I(0.05))
mq_nnc_text <- theme(
  axis.title = element_text(size = 12),
  strip.text.x = element_text(size = 12)
)

# Score
Score_plot4 <- ggplot(data = MQ_All_data_NNC, mapping = aes(x = pASTY_Score, y = pSTY_Score)) +
  mq_nnc_points +
  facet_wrap(~ Dataset)
Score_plot4 + mq_nnc_text
summarize(
  group_by(MQ_All_data_NNC, Dataset),
  cor = cor(pASTY_Score, pSTY_Score, use = "complete.obs")
)

# PEP
Score_plot5 <- ggplot(data = MQ_All_data_NNC, mapping = aes(x = pASTY_PEP, y = pSTY_PEP)) +
  mq_nnc_points +
  facet_wrap(~ Dataset)
Score_plot5 + mq_nnc_text
summarize(
  group_by(MQ_All_data_NNC, Dataset),
  cor = cor(pASTY_PEP, pSTY_PEP, use = "complete.obs")
)

# PTMprob
Score_plot6 <- ggplot(data = MQ_All_data_NNC, mapping = aes(x = pASTY_PTMprob, y = pSTY_PTMprob)) +
  mq_nnc_points +
  facet_wrap(~ Dataset)
Score_plot6 + mq_nnc_text
summarize(
  group_by(MQ_All_data_NNC, Dataset),
  cor = cor(pASTY_PTMprob, pSTY_PTMprob, use = "complete.obs")
)


# Peptides that don't contain Alanine #
######################################

# Number of alanines per Sequence (NA where Sequence is NA).
MQ_All_data$num_A <- str_count(MQ_All_data$Sequence, "A")

# Rows without alanine that also carry a Unique_scan. Rows whose alanine count
# is NA are left out: which() ignores the NA positions of the mask.
mq_no_ala_mask <- !(MQ_All_data$num_A > 0) & !is.na(MQ_All_data$Unique_scan)
MQ_All_NoAla <- MQ_All_data[which(mq_no_ala_mask), ]


# MQ, peptides without alanine (MQ_All_NoAla): pASTY-vs-pSTY scatter plots per
# Dataset for Score, PEP and PTMprob, each followed by the per-dataset
# correlation over complete observations.
mq_noala_points <- geom_point(shape = 5, color = "steelblue", size = 2, alpha = I(0.05))
mq_noala_text <- theme(
  axis.title = element_text(size = 12),
  strip.text.x = element_text(size = 12)
)

# Score
Score_plot7 <- ggplot(MQ_All_NoAla, aes(x = pASTY_Score, y = pSTY_Score)) +
  mq_noala_points +
  facet_wrap(~ Dataset)
Score_plot7 + mq_noala_text
summarize(
  group_by(MQ_All_NoAla, Dataset),
  cor = cor(pASTY_Score, pSTY_Score, use = "complete.obs")
)

# PEP
Score_plot8 <- ggplot(MQ_All_NoAla, aes(x = pASTY_PEP, y = pSTY_PEP)) +
  mq_noala_points +
  facet_wrap(~ Dataset)
Score_plot8 + mq_noala_text
summarize(
  group_by(MQ_All_NoAla, Dataset),
  cor = cor(pASTY_PEP, pSTY_PEP, use = "complete.obs")
)

# PTMprob
Score_plot9 <- ggplot(MQ_All_NoAla, aes(x = pASTY_PTMprob, y = pSTY_PTMprob)) +
  mq_noala_points +
  facet_wrap(~ Dataset)
Score_plot9 + mq_noala_text
summarize(
  group_by(MQ_All_NoAla, Dataset),
  cor = cor(pASTY_PTMprob, pSTY_PTMprob, use = "complete.obs")
)


#########################################                  PD                     #################################

# PD exports of the rice datasets: a plain PSM/site table and an "_A"
# counterpart per accession (presumably the pSTY and pASTY searches --
# confirm). The accession order below is the row order of PD_All_data.
pd_plain_files <- c(
  "PXD000923" = "D:/Pipeline comparisons/Writing/Data/PD/Rice/PD_PXD000923_PSMSITE.csv",
  "PXD002222" = "D:/Pipeline comparisons/Writing/Data/PD/Rice/PD_PXD002222_PSMSITE.csv",
  "PXD002756" = "D:/Pipeline comparisons/Writing/Data/PD/Rice/PD_PXD002756_PSMSITE.csv",
  "PXD004705" = "D:/Pipeline comparisons/Writing/Data/PD/Rice/PD_PXD004705_PSMSITE.csv",
  "PXD004939" = "D:/Pipeline comparisons/Writing/Data/PD/Rice/PD_PXD004939_PSMSITE.csv",
  "PXD005241" = "D:/Pipeline comparisons/Writing/Data/PD/Rice/PD_PXD005241_PSMSITE.csv",
  "PXD012764" = "D:/Pipeline comparisons/Writing/Data/PD/Rice/PD_PXD012764_PSMSITE.csv",
  "PXD019291" = "D:/Pipeline comparisons/Writing/Data/PD/Rice/PD_PXD019291_PSMSITE.csv"
)
pd_a_files <- c(
  "PXD000923" = "D:/Pipeline comparisons/Writing/Data/PD/Rice/PD_PXD000923_A_PSMSITE.csv",
  "PXD002222" = "D:/Pipeline comparisons/Writing/Data/PD/Rice/PD_PXD002222_A_PSMSITE.csv",
  "PXD002756" = "D:/Pipeline comparisons/Writing/Data/PD/Rice/PD_PXD002756_A_PSMSITE.csv",
  "PXD004705" = "D:/Pipeline comparisons/Writing/Data/PD/Rice/PD_PXD004705_A_PSMSITE.csv",
  "PXD004939" = "D:/Pipeline comparisons/Writing/Data/PD/Rice/PD_PXD004939_A_PSMSITE.csv",
  "PXD005241" = "D:/Pipeline comparisons/Writing/Data/PD/Rice/PD_PXD005241_A_PSMSITE.csv",
  "PXD012764" = "D:/Pipeline comparisons/Writing/Data/PD/Rice/PD_PXD012764_A_PSMSITE.csv",
  "PXD019291" = "D:/Pipeline comparisons/Writing/Data/PD/Rice/PD_PXD019291_A_PSMSITE.csv"
)
pd_ids <- names(pd_plain_files)

# Read all plain tables first, then all "_A" tables.
pd_plain <- lapply(pd_plain_files, read.csv)
pd_a <- lapply(pd_a_files, read.csv)

# PD_CompScans_AlltoAll() takes the "_A" table first. The accession is stored
# in a Dataset column so the stacked table can be faceted and grouped by it.
pd_merged <- Map(
  function(with_a, plain, accession) {
    merged <- PD_CompScans_AlltoAll(with_a, plain)
    merged$Dataset <- accession
    merged
  },
  pd_a, pd_plain, pd_ids
)

# Keep every per-dataset table reachable under its usual name:
# PD_<accession>, PD_<accession>A and PD_Merged_<accession>.
invisible(list2env(
  c(
    setNames(pd_plain, paste0("PD_", pd_ids)),
    setNames(pd_a, paste0("PD_", pd_ids, "A")),
    setNames(pd_merged, paste0("PD_Merged_", pd_ids))
  ),
  envir = environment()
))

PD_All_data <- dplyr::bind_rows(unname(pd_merged))

# PD, all merged rows: pASTY value on x against pSTY value on y, one panel per
# Dataset, for three column pairs (Score, PEP, PTMprob). Each plot is printed
# with size-12 axis titles and strip labels and is followed by the per-dataset
# correlation of the same pair (complete observations only).
#
# The plots are stored as Score_plot, Score_plot2 and Score_plot3, matching the
# TPP and MQ sections. Assigning all three to Score_plot would leave
# Score_plot2/Score_plot3 pointing at the MQ plots created earlier.
pd_point_layer <- geom_point(shape = 5, color = "steelblue", size = 2, alpha = I(0.05))
pd_text_sizes <- theme(
  axis.title = element_text(size = 12),
  strip.text.x = element_text(size = 12)
)

# Score
Score_plot <- ggplot(PD_All_data, aes(x = pASTY_Score, y = pSTY_Score)) +
  pd_point_layer +
  facet_wrap(~ Dataset)
Score_plot + pd_text_sizes

PD_All_data %>%
  group_by(Dataset) %>%
  summarize(cor = cor(pASTY_Score, pSTY_Score, use = "complete.obs"))

# PEP
Score_plot2 <- ggplot(PD_All_data, aes(x = pASTY_PEP, y = pSTY_PEP)) +
  pd_point_layer +
  facet_wrap(~ Dataset)
Score_plot2 + pd_text_sizes

PD_All_data %>%
  group_by(Dataset) %>%
  summarize(cor = cor(pASTY_PEP, pSTY_PEP, use = "complete.obs"))

# PTMprob
Score_plot3 <- ggplot(PD_All_data, aes(x = pASTY_PTMprob, y = pSTY_PTMprob)) +
  pd_point_layer +
  facet_wrap(~ Dataset)
Score_plot3 + pd_text_sizes

PD_All_data %>%
  group_by(Dataset) %>%
  summarize(cor = cor(pASTY_PTMprob, pSTY_PTMprob, use = "complete.obs"))


# Distribution of the pASTY PTM probability for each value of Amino.
boxplot(pASTY_PTMprob ~ Amino, PD_All_data)



# Removing no choice peptides #
###############################

# Count the S/T/Y residues of every Sequence; nSTY is their total.
PD_All_data$num_PTM_S <- str_count(PD_All_data$Sequence, "S")
PD_All_data$num_PTM_Y <- str_count(PD_All_data$Sequence, "Y")
PD_All_data$num_PTM_T <- str_count(PD_All_data$Sequence, "T")
PD_All_data$nSTY <- PD_All_data$num_PTM_Y + PD_All_data$num_PTM_S + PD_All_data$num_PTM_T


# nPhos is the number of rows sharing a Unique_scan.
# NOTE(review): this assumes one row per phosphosite of a scan, and rows with
# an NA Unique_scan are counted together as one group -- confirm.
# ungroup() is required: without it PD_All_data and every subset taken from it
# remain grouped by Unique_scan, so each `[` has to rebuild the grouping and
# later dplyr verbs would silently run per scan.
PD_All_data <- PD_All_data %>%
  group_by(Unique_scan) %>%
  dplyr::mutate(nPhos = n()) %>%
  dplyr::ungroup()


# S/T/Y residues left over after subtracting nPhos (NA when nSTY is NA).
pd_spare_sites <- PD_All_data$nSTY - PD_All_data$nPhos

# NNC: more S/T/Y residues than nPhos. NC ("no choice"): exactly as many,
# restricted to rows that have both Sequence and SequenceSTY. _check: rows
# where the difference is NA. which() skips NA comparisons, so no all-NA rows
# are created and nothing has to be filtered out afterwards.
PD_All_data_NNC <- PD_All_data[which(pd_spare_sites > 0), ]
PD_All_data_NC <- PD_All_data[
  which(pd_spare_sites == 0 &
          !is.na(PD_All_data$Sequence) &
          !is.na(PD_All_data$SequenceSTY)),
]

PD_All_data_check <- PD_All_data[which(is.na(pd_spare_sites)), ]

# PD, rows with more S/T/Y residues than nPhos (PD_All_data_NNC): the same
# three pASTY-vs-pSTY scatter plots and per-dataset correlations as for the
# full table.
pd_nnc_points <- geom_point(shape = 5, color = "steelblue", size = 2, alpha = I(0.05))
pd_nnc_text <- theme(
  axis.title = element_text(size = 12),
  strip.text.x = element_text(size = 12)
)

# Score
Score_plot4 <- ggplot(data = PD_All_data_NNC, mapping = aes(x = pASTY_Score, y = pSTY_Score)) +
  pd_nnc_points +
  facet_wrap(~ Dataset)
Score_plot4 + pd_nnc_text
summarize(
  group_by(PD_All_data_NNC, Dataset),
  cor = cor(pASTY_Score, pSTY_Score, use = "complete.obs")
)

# PEP
Score_plot5 <- ggplot(data = PD_All_data_NNC, mapping = aes(x = pASTY_PEP, y = pSTY_PEP)) +
  pd_nnc_points +
  facet_wrap(~ Dataset)
Score_plot5 + pd_nnc_text
summarize(
  group_by(PD_All_data_NNC, Dataset),
  cor = cor(pASTY_PEP, pSTY_PEP, use = "complete.obs")
)

# PTMprob
Score_plot6 <- ggplot(data = PD_All_data_NNC, mapping = aes(x = pASTY_PTMprob, y = pSTY_PTMprob)) +
  pd_nnc_points +
  facet_wrap(~ Dataset)
Score_plot6 + pd_nnc_text
summarize(
  group_by(PD_All_data_NNC, Dataset),
  cor = cor(pASTY_PTMprob, pSTY_PTMprob, use = "complete.obs")
)


# Peptides that don't contain Alanine #
######################################

# Number of alanines per Sequence (NA where Sequence is NA).
PD_All_data$num_A <- str_count(PD_All_data$Sequence, "A")

# Rows without alanine that also carry a Unique_scan. Rows whose alanine count
# is NA are left out: which() ignores the NA positions of the mask.
pd_no_ala_mask <- !(PD_All_data$num_A > 0) & !is.na(PD_All_data$Unique_scan)
PD_All_NoAla <- PD_All_data[which(pd_no_ala_mask), ]


# PD, peptides without alanine (PD_All_NoAla): pASTY-vs-pSTY scatter plots per
# Dataset for Score, PEP and PTMprob, each followed by the per-dataset
# correlation over complete observations.
pd_noala_points <- geom_point(shape = 5, color = "steelblue", size = 2, alpha = I(0.05))
pd_noala_text <- theme(
  axis.title = element_text(size = 12),
  strip.text.x = element_text(size = 12)
)

# Score
Score_plot7 <- ggplot(PD_All_NoAla, aes(x = pASTY_Score, y = pSTY_Score)) +
  pd_noala_points +
  facet_wrap(~ Dataset)
Score_plot7 + pd_noala_text
summarize(
  group_by(PD_All_NoAla, Dataset),
  cor = cor(pASTY_Score, pSTY_Score, use = "complete.obs")
)

# PEP
Score_plot8 <- ggplot(PD_All_NoAla, aes(x = pASTY_PEP, y = pSTY_PEP)) +
  pd_noala_points +
  facet_wrap(~ Dataset)
Score_plot8 + pd_noala_text
summarize(
  group_by(PD_All_NoAla, Dataset),
  cor = cor(pASTY_PEP, pSTY_PEP, use = "complete.obs")
)

# PTMprob
Score_plot9 <- ggplot(PD_All_NoAla, aes(x = pASTY_PTMprob, y = pSTY_PTMprob)) +
  pd_noala_points +
  facet_wrap(~ Dataset)
Score_plot9 + pd_noala_text
summarize(
  group_by(PD_All_NoAla, Dataset),
  cor = cor(pASTY_PTMprob, pSTY_PTMprob, use = "complete.obs")
)




# Figure 3: boxplots of scores for no-choice peptides (rows where nSTY - nPhos == 0)

# Samino: cut SequenceSTY off at the position column and keep the last
# character of what remains.
# NOTE(review): the position column is PTM.positionsSTY for TPP/MQ but
# PTM_PositionsSTY for PD -- confirm against the merge helpers.
tpp_up_to_site <- substr(TPP_All_data_NC$SequenceSTY, 1, TPP_All_data_NC$PTM.positionsSTY)
mq_up_to_site <- substr(MQ_All_data_NC$SequenceSTY, 1, MQ_All_data_NC$PTM.positionsSTY)
pd_up_to_site <- substr(PD_All_data_NC$SequenceSTY, 1, PD_All_data_NC$PTM_PositionsSTY)

TPP_All_data_NC$Samino <- str_sub(tpp_up_to_site, start = -1)
MQ_All_data_NC$Samino <- str_sub(mq_up_to_site, start = -1)
PD_All_data_NC$Samino <- str_sub(pd_up_to_site, start = -1)

# For each pipeline build two tables from the no-choice rows:
#   *_STYA: Amino, Score (= pASTY_Score), Pipeline, search = "pASTY"
#   *_STY : Score (= pSTY_Score), Amino (= "p" + Samino), Pipeline, search = "pSTY"

# ---- TPP ----
TPP_All_data_NC_STYA <- TPP_All_data_NC[, c("Amino", "pASTY_Score")]
names(TPP_All_data_NC_STYA)[2] <- "Score"
TPP_All_data_NC_STYA$Pipeline <- "TPP"
TPP_All_data_NC_STYA$search <- "pASTY"

TPP_All_data_NC_STY <- TPP_All_data_NC[, "pSTY_Score", drop = FALSE]
names(TPP_All_data_NC_STY) <- "Score"
TPP_All_data_NC_STY$Amino <- paste0("p", TPP_All_data_NC$Samino)
TPP_All_data_NC_STY$Pipeline <- "TPP"
TPP_All_data_NC_STY$search <- "pSTY"

# ---- MQ ----
MQ_All_data_NC_STYA <- MQ_All_data_NC[, c("Amino", "pASTY_Score")]
names(MQ_All_data_NC_STYA)[2] <- "Score"
MQ_All_data_NC_STYA$Pipeline <- "MQ"
MQ_All_data_NC_STYA$search <- "pASTY"

MQ_All_data_NC_STY <- MQ_All_data_NC[, "pSTY_Score", drop = FALSE]
names(MQ_All_data_NC_STY) <- "Score"
MQ_All_data_NC_STY$Amino <- paste0("p", MQ_All_data_NC$Samino)
MQ_All_data_NC_STY$Pipeline <- "MQ"
MQ_All_data_NC_STY$search <- "pSTY"

# ---- PD ----
PD_All_data_NC_STYA <- PD_All_data_NC[, c("Amino", "pASTY_Score")]
names(PD_All_data_NC_STYA)[2] <- "Score"
PD_All_data_NC_STYA$Pipeline <- "PD"
PD_All_data_NC_STYA$search <- "pASTY"

PD_All_data_NC_STY <- PD_All_data_NC[, "pSTY_Score", drop = FALSE]
names(PD_All_data_NC_STY) <- "Score"
PD_All_data_NC_STY$Amino <- paste0("p", PD_All_data_NC$Samino)
PD_All_data_NC_STY$Pipeline <- "PD"
PD_All_data_NC_STY$search <- "pSTY"

# Stack the per-pipeline no-choice tables, one table per search.
All_pASTY <- rbind(TPP_All_data_NC_STYA, MQ_All_data_NC_STYA, PD_All_data_NC_STYA)
All_pSTY <- rbind(TPP_All_data_NC_STY, MQ_All_data_NC_STY, PD_All_data_NC_STY)

common_column_names <- intersect(names(All_pSTY), names(All_pASTY))

# Combine both searches into one long table. The two inputs can never share a
# row because `search` is "pASTY" in one and "pSTY" in the other, so a full
# outer merge() keyed on every column (including the floating-point Score) only
# stacks them, at the cost of building and sorting join keys. bind_rows()
# stacks directly, matching columns by name and filling any column missing from
# one input with NA, as merge(all = TRUE) did. The shared columns are put first
# to keep merge()'s column order; rows are no longer sorted by the key columns,
# which does not affect the tables or the plot below.
All_data_NC <- dplyr::bind_rows(All_pASTY, All_pSTY)
All_data_NC <- All_data_NC[, union(common_column_names, names(All_data_NC)), drop = FALSE]

# Number of rows per pipeline, then per pipeline and residue label.
tab1(All_data_NC$Pipeline)

table(All_data_NC$Pipeline, All_data_NC$Amino)

# Score by residue label, boxes filled by search, one free-scaled panel per
# pipeline.
p <- ggplot(data = All_data_NC, aes(x = Amino, y = Score)) +
  geom_boxplot(aes(fill = search))
p + facet_wrap(~ Pipeline, scales = "free")






